%Code for Cao, Hansen, Kozbur and Villacorta, (2021) " Inference for Dependent Data with Learned Clusters"

%The code main_empirical_application_2021.m produces Table 1 and Table 2 in
%Section 3 of the paper.
%% Clean things up
clear;
clc;

restoredefaultpath
addpath('functions');
addpath('data');
warning('off','MATLAB:nearlySingularMatrix');
warning('off','all');

%% Import election violence data
eldata = importdata('ElectionViolenceTABLE2.csv');

%% Define variables
% Outcomes
Y1 = eldata.data(:,1);
Y2 = eldata.data(:,2);
Y3 = eldata.data(:,3);

Y=Y1;

% Treatment (endogenous)
D = eldata.data(:,4);

% Instrument 
Z = eldata.data(:,5);

%Set of covariates
POP = eldata.data(:,6);
ELE = eldata.data(:,7);

ID = eldata.data(:,8);
XCoord = eldata.data(:,9);
YCoord = eldata.data(:,10);

CONT1 = [eldata.data(:,11:12) eldata.data(:,15:20)];
CONT2 = [eldata.data(:,13:14) eldata.data(:,21:26)];
CONT3 = eldata.data(:,27);

nT = numel(D);
n = numel(unique(ID));


%% Replicate results: Table 2 Condra et al (2018)
% Column 1
X1 = [D POP ELE ones(nT,1)];
b1 = X1\Y1;
s1 = cluster_se(X1,Y1 - X1*b1,inv(X1'*X1),recode(ID));

% Column 2
X2 = [D POP ELE CONT1 ones(nT,1)];
Z2 = [Z POP ELE CONT1 ones(nT,1)];
b2 = (Z2'*X2)\(Z2'*Y1);
s2 = cluster_se(Z2,Y1 - X2*b2,inv(Z2'*X2),recode(ID));
% First stage
pi2 = Z2\D;
spi2 = cluster_se(Z2,D - Z2*pi2,inv(Z2'*Z2),recode(ID));

% Column 3
X3 = [D zscore(POP) ELE CONT1 CONT2 ones(nT,1)];
Z3 = [Z zscore(POP) ELE CONT1 CONT2 ones(nT,1)];
b3 = (Z3'*X3)\(Z3'*Y1);
s3 = cluster_se(Z3,Y1 - X3*b3,inv(Z3'*X3),recode(ID));
% First stage
pi3 = Z3\D;
spi3 = cluster_se(Z3,D - Z3*pi3,inv(Z3'*Z3),recode(ID));

% Column 4
X4 = [D zscore(POP) ELE CONT1 CONT2 CONT3 ones(nT,1)];
Z4 = [Z zscore(POP) ELE CONT1 CONT2 CONT3 ones(nT,1)];
b4 = (Z4'*X4)\(Z4'*Y1);
s4 = cluster_se(Z4,Y1 - X4*b4,inv(Z4'*X4),recode(ID));
% First stage
pi4 = Z4\D;
spi4 = cluster_se(Z4,D - Z4*pi4,inv(Z4'*Z4),recode(ID));

Xbase = [zscore(POP) ELE CONT1 CONT2 CONT3 ones(nT,1)];

% Column 5
b5 = (Z4'*X4)\(Z4'*Y2);
s5 = cluster_se(Z4,Y2 - X4*b5,inv(Z4'*X4),recode(ID));

% Column 6
b6 = (Z4'*X4)\(Z4'*Y3);
s6 = cluster_se(Z4,Y3 - X4*b6,inv(Z4'*X4),recode(ID));

fprintf('\n                      Reproduce Results\n');
fprintf('              (2)        (3)        (4)        (5)        (6)\n');
fprintf('IV          %2.3f     %2.3f     %2.3f     %2.3f     %2.3f \n',...
    b2(1),b3(1),b4(1),b5(1),b6(1));
fprintf(['s.e.         %2.3f      %2.3f      %2.3f      %2.3f',...
    '      %2.3f \n'],...
    s2(1),s3(1),s4(1),s5(1),s6(1));
fprintf('First Stage  %2.3f      %2.3f      %2.3f \n',pi2(1),pi3(1),...
    pi4(1));
fprintf('s.e.         %2.3f      %2.3f      %2.3f \n\n\n',spi2(1),...
    spi3(1),spi4(1));


%% Some Moran tests
method = 'iv';
data.X = Z4(:,2:end);
data.D = D;
data.Z = Z;
data.Y = Y1;
SCORE = score_estimation(method,data);

W0 = spatialkNNSym(squareform(pdist([XCoord(ELE == 0),YCoord(ELE == 0)])),2);
W1 = spatialkNNSym(squareform(pdist([XCoord(ELE == 1),YCoord(ELE == 1)])),2);
moran0 = moran(W0,SCORE(ELE == 0));
moran1 = moran(W1,SCORE(ELE == 1));

WA = spatialkNNSym(squareform(pdist([XCoord,YCoord,10000*ELE])),2);
moranA = moran(WA,SCORE);

fprintf('\n (Spatial) Moran test T = 0 (p-value): %2.5f (%2.5f) \n',moran0,2*normcdf(-abs(moran0)));
fprintf('\n (Spatial) Moran test T = 1 (p-value): %2.5f (%2.5f) \n',moran1,2*normcdf(-abs(moran1)));
fprintf('\n (Spatial) Moran test All (p-value): %2.5f (%2.5f) \n',moranA,2*normcdf(-abs(moranA)));

WT = (dummyvar(recode(ID))*dummyvar(recode(ID))')-eye(2*n);
moranT = moran(WT,SCORE);

fprintf('\n (Intertemporal) Moran test (p-value): %2.5f (%2.5f) \n',moranT,2*normcdf(-abs(moranT)));


%% Clustering: k-Medoids

% Upper bound in the number of clusters
G=8 

rng(7)

% Create clusters
clustsMed = zeros(n,G-1);
for jj = 2:G 
    clustsMed(:,jj-1) = kmedoids([XCoord(1:2:end),YCoord(1:2:end)], jj);
end
clustsMedPan = kron(clustsMed,ones(2,1));


%% Inferencial procedures based on learned clusters
%==============================================================================================
% Apply inferencial procedures based on learned clusters to Condra et al (2018)
%===============================================================================================
% X_C=X4 without D
X = [zscore(POP) ELE CONT1 CONT2 CONT3 ones(nT,1)]; % 
Z=Z;
D=D;
Y=Y1;

p=size(X,2)-1; % p is the number of covariates exlcuding the intercept 
M = eye(nT) - X*((X'*X)\X');
[~,~,ex] = qr(M,'vector');
useQML = ex(1:end-p-1); 

mZ = Z - X*(X\Z);

% First stage estimates (full sample)
fscoef = [Z X]\D;
%Residual FS
vhat = D - [Z X]*fscoef;
    
% IV estimate
mD = D - X*(X\D);
mY = Y - X*(X\Y);
ahat = (mZ'*mD)\(mZ'*mY);
% Residual
uhat = mY-mD*ahat;


bb=1;
%set of alternatives to compute power
poweralts = -2:.001:2;
npalts = numel(poweralts);

 
% Cluster by location
    group_location=recode(ID);
    seLocation(bb,1) = cluster_se(mZ,uhat,1/(mZ'*mD),group_location,p+2);    
    
% Cluster by k-medoids groupings
    group_matrix_km=clustsMedPan;
    for kk = 1:G-1 
        seG(bb,kk) = cluster_se(mZ,uhat,1/(mZ'*mD),group_matrix_km(:,kk),p+2);
    end
       
    
    
% IM and CRS
    for kk = 1:G-1 
        [akk,sekk,btempkk] = FamaMacbeth(D,X,Y,Z,group_matrix_km(:,kk));
        ahatIM(bb,kk) = akk(1);
        seIM(bb,kk) = sekk(1);
        btemp = btempkk(:,1);
        pvalCRS(bb,kk) = FamaMacbethRand(btemp,0);
        for jj = 1:npalts
            pvalpowerCRS(bb,jj,kk) = FamaMacbethRand(btemp,poweralts(jj));
        end
    end


%% Bootstrap based on parametric covariance estimation and optimal number of clusters

%==========================================================================
% Step 1: Estimate parametric model - Homogeneous exponential
%===========================================================================
% Distance Measures
%time_mat = squareform(pdist(kron(ones(n,1),[1;2])));
Panel.dGeo = squareform(pdist([XCoord,YCoord]));
Panel.dTime = squareform(pdist(ELE));

%Heteroskedastic model
%X_Z= [Z X(:,1) X(:,2:12) log(X(:,13:15)) X(:,16:19) ones(nT,1)];
% Homocedastic model
X_Z= [ones(nT,1)];
pp=size(X_Z,2);

% Adjusting the QML for estimated residuals
 Sigma_func = @(w) M(useQML,:)*(((diag(sqrt(exp(X_Z*w(1:pp)))))*exp(-Panel.dGeo/exp(w(pp+1))- Panel.dTime/exp(w(pp+2)))*(diag(sqrt(exp(X_Z*w(1:pp)))))))*(M(useQML,:)');
 
 QU = @(w) .5*logdet(Sigma_func(w))+.5*uhat(useQML,1)'*(Sigma_func(w)\uhat(useQML,1));
 QV = @(w) .5*logdet(Sigma_func(w))+.5*vhat(useQML,1)'*(Sigma_func(w)\vhat(useQML,1));
     
    options = optimoptions('fminunc','display','off');
    alpha_init = zeros(pp+2,1);
    
    alphaHatU = fminunc(QU,alpha_init,options);
    alphaHatV = fminunc(QV,alpha_init,options);


    Sigma_func_DGP = @(w) (diag(sqrt(exp(X_Z*w(1:pp)))))*exp(-Panel.dGeo/exp(w(pp+1))- Panel.dTime/exp(w(pp+2)))*(diag(sqrt(exp(X_Z*w(1:pp)))));
    
    SigmaHatU = Sigma_func_DGP(alphaHatU);
    SigmaHatV = Sigma_func_DGP(alphaHatV);
    
     if min(eig(SigmaHatU)) < 0 || min(eig(SigmaHatV)) < 0
        error('Non psd covariance matrix');
     end
        
    uhatstar = sqrtm(SigmaHatU)\uhat;
    vhatstar = sqrtm(SigmaHatV)\vhat;
    CSHatU = sqrtm(SigmaHatU);
    CSHatV = sqrtm(SigmaHatV);
    
%      uhatstar = (chol(SigmaHatU))\uhat;
%      vhatstar = (chol(SigmaHatV))\vhat;
%      CSHatU = chol(SigmaHatU);
%      CSHatV = chol(SigmaHatV);
    
    rhohat = corr(uhatstar,vhatstar)
    %rhohat = corr(uhat,vhat);
    
    qmle = [exp(alphaHatU(1:3))' exp(alphaHatU(4:end))' exp(alphaHatV(1:3))' exp(alphaHatV(4:end))' rhohat]; 
    qmle_U = [alphaHatU(1:pp)' exp(alphaHatU(pp+1:end))']'
    qmle_V = [alphaHatV(1:pp)' exp(alphaHatV(pp+1:end))']'
      
 %==========================================================================
 % Step 2: Resample and re-estimate
 %==========================================================================
    rng(91720);
    Bboot = 10000;   
    alternatives =  [(-10:1:-1) (1:1:10)]'/sqrt(nT); ;
    nalt = numel(alternatives);
    
    aboot = zeros(Bboot,1);
    abootIM = zeros(Bboot,1);
    
    sboot = zeros(Bboot,G-1);
    sebootIM = zeros(Bboot,G-1);
    sloc = zeros(Bboot,1);
    
    pbootCRS = zeros(Bboot,G-1);
    pbootpowerCRS = zeros(Bboot,nalt,G-1);
        
    for rr = 1:Bboot
        if ~mod(rr,100)
            fprintf('Replication %d \n',rr);
        end
        
        errboot = mvnrnd([0 0],[1 rhohat ; rhohat 1],nT);
        Uboot = CSHatU'*errboot(:,1);
        Vboot = CSHatV'*errboot(:,2);        
        
        Dboot = [Z X]*fscoef + Vboot;
        %Yboot = Uboot;
        Yboot = Uboot+X*b4(2:end);

        mDboot = Dboot - X*(X\Dboot);
        mYboot = Yboot - X*(X\Yboot);
        
        aboot(rr,1) = (mZ'*mDboot)\(mZ'*mYboot);
        residboot = mYboot-mDboot*aboot(rr,1);
        
        sloc(rr,1) = cluster_se(mZ,residboot,1/(mZ'*mDboot),group_location,p+2); %p+2?
        
        % Cluster by k-medoids groupings
        for kk = 1:G-1
            sboot(rr,kk) = cluster_se(mZ,residboot,1/(mZ'*mDboot),group_matrix_km(:,kk),p+2);
        end
        
         % IM and CRS
        for kk = 1:G-1
            [akk,sekk,btempkk] = FamaMacbeth(Dboot,X,Yboot,Z,group_matrix_km(:,kk));
            abootIM(rr,kk) = akk(1);
            sebootIM(rr,kk) = sekk(1);
            btemp = btempkk(:,1);
            pbootCRS(rr,kk) = FamaMacbethRand(btemp,0);
            for jj = 1:nalt
                pbootpowerCRS(rr,jj,kk) = FamaMacbethRand(btemp,alternatives(jj));
            end
        end
        
    end
    
    
    %=============================================================================
    % Step 3: Bootstrap size/power and optimization of power subject to size control
    %=============================================================================
    
    % Critical values that would control size (in bootstrap)
    
    wap = zeros(G,1);
    wapIM = zeros(G-1,1);
    wapCRS = zeros(G-1,1);
    
    cvalLocation(bb,1) = max(tinv(.975,max(group_location)-1),prctile(abs(aboot./sloc),95));
    
    wap(1) = mean(mean(abs(aboot*ones(1,nalt)-ones(Bboot,1)*alternatives')...
        ./(sloc*ones(1,nalt)) > cvalLocation(bb,1)));
    
    for kk = 1:G-1
        cvalG(bb,kk) = max(tinv(.975,max(group_matrix_km(:,kk))-1),prctile(abs(aboot./sboot(:,kk)),95));
        wap(1+kk) = mean(mean(abs(aboot*ones(1,nalt)-ones(Bboot,1)*alternatives')...
            ./(sboot(:,kk)*ones(1,nalt)) > cvalG(bb,kk)));
        cvalIM(bb,kk) = max(tinv(.975,max(group_matrix_km(:,kk))-1),prctile(abs(abootIM(:,kk)./sebootIM(:,kk)),95));
        wapIM(kk) = mean(mean(abs(abootIM(:,kk)*ones(1,nalt)-ones(Bboot,1)*alternatives')...
            ./(sebootIM(:,kk)*ones(1,nalt)) > cvalIM(bb,kk)));
        tmp = [0;unique(pbootCRS(:,kk))];
        tmpu = tmp(tmp <= .05);
        if numel(tmpu) == 1
            pvalsimCRS(bb,kk) = .05;
            
        else
            tmps = zeros(numel(tmpu),1);
            for mm = 1:numel(tmpu)
                tmps(mm) = mean(pbootCRS(:,kk) <= tmp(mm));
            end
            tmpind = find(tmps <= .05);
            %pvalsimCRS(bb,kk) = tmpu(max(tmpind));
            pvalsimCRS(bb,kk) = min(quantile(pbootCRS(:,kk),.05),.05)  %% new Jianfei           
        end
          wapCRS(kk) = mean(mean(pbootpowerCRS(:,:,kk) <= pvalsimCRS(bb,kk)));
    end
 
    %========================================================================
    % Step 4: Select optimal number of clusters
    %========================================================================
    [~,indA] = max(wap);
    if indA == 1
        seGstarA(bb,1) = seLocation(bb,1);
        cvalGstarA(bb,1) = cvalLocation(bb,1);
        GstarA(bb,1) = max(group_location);        
    else
        seGstarA(bb,1) = seG(bb,indA-1);
        cvalGstarA(bb,1) = cvalG(bb,indA-1);
        GstarA(bb,1) = max(group_matrix_km(:,indA-1));
    end
      
    [~,indB] = max(wap(2:end));
    seGstarB(bb,1) = seG(bb,indB);
    cvalGstarB(bb,1) = cvalG(bb,indB);
    GstarB(bb,1) = max(group_matrix_km(:,indB));
    
    [~,indB] = max(wapIM);
    ahatIMstarB(bb,1) = ahatIM(bb,indB);
    seIMstarB(bb,1) = seIM(bb,indB);
    cvalIMstarB(bb,1) = cvalIM(bb,indB);
    IMstarB(bb,1) = max(group_matrix_km(:,indB));

    indB = max(find(wapCRS == max(wapCRS)));
    pvalCRSstarB(bb,1) = pvalCRS(bb,indB);
    pvalsimCRSstarB(bb,1) = pvalsimCRS(bb,indB);
    pvalpowerCRSstarB(bb,:) = pvalpowerCRS(bb,:,indB);
    CRSstarB(bb,1) = max(group_matrix_km(:,indB));
    ahatCRSstarB(bb,1) = ahatIM(bb,indB); %when CRSstarB is different from IMstarB
    
%===============================================================================    
    %% Results
    %tstat
    tstat_loc=abs(ahat)./seLocation;
    tstat_CCE=abs(ahat)./seG;
    tstat_IM=abs(ahatIM)./seIM;
    
    tstat_CCE_Gopt=abs(ahat)/seGstarB;
    tstat_IM_Gopt=abs(ahatIMstarB)/seIMstarB;
    
    % CI usual
    cv_loc=tinv(.975,max(group_location)-1);
    cv_CCE=tinv(.975,max(group_matrix_km)-1);
    cv_IM=tinv(.975,max(group_matrix_km)-1);   
    
    CI_lower_loc=ahat-cv_loc*seLocation;
    CI_uppers_loc=ahat+cv_loc*seLocation;
    
    CI_lowerCCE=ahat-cv_CCE.*seG;
    CI_upperCCE=ahat+cv_CCE.*seG;
    
    CI_lowerIM=ahatIM-cv_IM.*seIM;
    CI_upperIM=ahatIM+cv_IM.*seIM;
    
    CI_lowerCCE_Gopt=CI_lowerCCE(GstarB-1);
    CI_upperCCE_Gopt=CI_upperCCE(GstarB-1);
    
    CI_lowerIM_Gopt=CI_lowerIM(IMstarB-1);
    CI_upperIM_Gopt=CI_upperIM(IMstarB-1);
        
    pvalnull_CRS=pvalpowerCRS(bb,:,CRSstarB-1)';
    KM_ciCRS = [poweralts(find(pvalnull_CRS > .05,1)) poweralts(find(pvalnull_CRS > .05,1,'last'))];
   
    % CI simulated critical values
    
    CI_lowerloc_SCV=ahat-cvalLocation*seLocation;
    CI_upperloc_SCV=ahat+cvalLocation*seLocation;
    
    CI_lowerCCE_SCV=ahat-cvalG.*seG;
    CI_upperCCE_SCV=ahat+cvalG.*seG;
    
    CI_lowerIM_SCV=ahatIM-cvalIM.*seIM;
    CI_upperIM_SCV=ahatIM+cvalIM.*seIM;
    
    CI_lowerCCE_Gopt_SCV=CI_lowerCCE_SCV(GstarB-1);
    CI_upperCCE_Gopt_SCV=CI_upperCCE_SCV(GstarB-1);
    
    CI_lowerIM_Gopt_SCV=CI_lowerIM_SCV(IMstarB-1);
    CI_upperIM_Gopt_SCV=CI_upperIM_SCV(IMstarB-1);
    
    p_null_CRS_SCV=pvalsimCRS;
    p_null_CRS_Gopt_SCV=p_null_CRS_SCV(CRSstarB-1);
    KM_ciCRS_SCV = [poweralts(find(pvalnull_CRS > p_null_CRS_Gopt_SCV,1)) poweralts(find(pvalnull_CRS > p_null_CRS_Gopt_SCV,1,'last'))]
       
    
    % Table usual CV
    
    table_inference = zeros(4,8);
    table_inference(:,1) = [ahat;ahat;...
    ahatIMstarB; ahatCRSstarB];

    table_inference(1:3,2) = [seLocation;seGstarB;...
    seIMstarB];
    
    table_inference(1:3,3) = [tstat_loc;tstat_CCE_Gopt;...
    tstat_IM_Gopt];

    table_inference(:,4:5) = [CI_lowerloc_SCV,...
    CI_upperloc_SCV;CI_lowerCCE_Gopt_SCV,...
    CI_upperCCE_Gopt_SCV;CI_lowerIM_Gopt_SCV,...
    CI_upperIM_Gopt_SCV;KM_ciCRS_SCV];

    table_inference(:,6:7) = [CI_lower_loc,...
    CI_uppers_loc;CI_lowerCCE_Gopt,...
    CI_upperCCE_Gopt;CI_lowerIM_Gopt,...
    CI_upperIM_Gopt;KM_ciCRS];

    table_inference(:,8) = [max(group_location);...
    GstarB;IMstarB;CRSstarB];

       
  %% Print output

fprintf('\n             Cluster-based Inference\n');
fprintf(['      theta0    s.e.   t-stat        C.I.        C.I.(usual CV)',...
    '    G','      \n']);
fprintf(['Unit  %2.3f   %2.3f   %2.3f  [%2.3f',...
    ', %2.3f]  [%2.3f',...
    ', %2.3f]   %d     \n'],table_inference(1,1),table_inference(1,2),...
    table_inference(1,3),table_inference(1,4),table_inference(1,5),table_inference(1,6),table_inference(1,7), table_inference(1,8));

fprintf(['CCE   %2.3f   %2.3f   %2.3f  [%2.3f',...
    ', %2.3f]  [%2.3f',...
    ', %2.3f]     %d     \n'],table_inference(2,1),table_inference(2,2),...
    table_inference(2,3),table_inference(2,4),table_inference(2,5),table_inference(2,6),table_inference(2,7), table_inference(2,8));

fprintf(['IM    %2.3f   %2.3f   %2.3f  [%2.3f',...
    ', %2.3f]  [%2.3f',...
    ', %2.3f]     %d     \n'],table_inference(3,1),table_inference(3,2),...
    table_inference(3,3),table_inference(3,4),table_inference(3,5),table_inference(3,6),table_inference(3,7), table_inference(3,8));

fprintf(['CRS   %2.3f   %2.3f   %2.3f  [%2.3f',...
    ', %2.3f]  [%2.3f',...
    ', %2.3f]     %d     \n'],table_inference(4,1),table_inference(4,2),...
    table_inference(4,3),table_inference(4,4),table_inference(4,5),table_inference(4,6),table_inference(4,7), table_inference(4,8));

% table_inference  
    

%% Stop storing results
% diary OFF;

% bootssize_pvalsim=sum(pbootCRS<=ones(1000,G-1).*pvalsimCRS)/1000
% bootssize_pvalact=sum(pbootCRS<=ones(1000,G-1).*0.05)/1000

